Name(s): David Sun, Yijun Luo
Website Link: https://jackkkkkkdzk.github.io/Power-Outage-Investigation/
import pandas as pd
import numpy as np
import os
import plotly.express as px
pd.options.plotting.backend = 'plotly'
#load outage data
# skiprows drops rows 0-4 and row 6 of the spreadsheet (presumably
# metadata/units rows around the header — confirm against the raw file);
# usecols='B:BE' drops the leading index column A.
data_path = os.path.join('data', 'outage.xlsx')
raw_outage = pd.read_excel(data_path, skiprows=[0,1,2,3,4,6], usecols='B:BE')
raw_outage
| OBS | YEAR | MONTH | U.S._STATE | POSTAL.CODE | NERC.REGION | CLIMATE.REGION | ANOMALY.LEVEL | CLIMATE.CATEGORY | OUTAGE.START.DATE | ... | POPPCT_URBAN | POPPCT_UC | POPDEN_URBAN | POPDEN_UC | POPDEN_RURAL | AREAPCT_URBAN | AREAPCT_UC | PCT_LAND | PCT_WATER_TOT | PCT_WATER_INLAND | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2011 | 7.0 | Minnesota | MN | MRO | East North Central | -0.3 | normal | 2011-07-01 | ... | 73.27 | 15.28 | 2279.0 | 1700.5 | 18.2 | 2.14 | 0.60 | 91.592666 | 8.407334 | 5.478743 |
| 1 | 2 | 2014 | 5.0 | Minnesota | MN | MRO | East North Central | -0.1 | normal | 2014-05-11 | ... | 73.27 | 15.28 | 2279.0 | 1700.5 | 18.2 | 2.14 | 0.60 | 91.592666 | 8.407334 | 5.478743 |
| 2 | 3 | 2010 | 10.0 | Minnesota | MN | MRO | East North Central | -1.5 | cold | 2010-10-26 | ... | 73.27 | 15.28 | 2279.0 | 1700.5 | 18.2 | 2.14 | 0.60 | 91.592666 | 8.407334 | 5.478743 |
| 3 | 4 | 2012 | 6.0 | Minnesota | MN | MRO | East North Central | -0.1 | normal | 2012-06-19 | ... | 73.27 | 15.28 | 2279.0 | 1700.5 | 18.2 | 2.14 | 0.60 | 91.592666 | 8.407334 | 5.478743 |
| 4 | 5 | 2015 | 7.0 | Minnesota | MN | MRO | East North Central | 1.2 | warm | 2015-07-18 | ... | 73.27 | 15.28 | 2279.0 | 1700.5 | 18.2 | 2.14 | 0.60 | 91.592666 | 8.407334 | 5.478743 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1529 | 1530 | 2011 | 12.0 | North Dakota | ND | MRO | West North Central | -0.9 | cold | 2011-12-06 | ... | 59.90 | 19.90 | 2192.2 | 1868.2 | 3.9 | 0.27 | 0.10 | 97.599649 | 2.401765 | 2.401765 |
| 1530 | 1531 | 2006 | NaN | North Dakota | ND | MRO | West North Central | NaN | NaN | NaT | ... | 59.90 | 19.90 | 2192.2 | 1868.2 | 3.9 | 0.27 | 0.10 | 97.599649 | 2.401765 | 2.401765 |
| 1531 | 1532 | 2009 | 8.0 | South Dakota | SD | RFC | West North Central | 0.5 | warm | 2009-08-29 | ... | 56.65 | 26.73 | 2038.3 | 1905.4 | 4.7 | 0.30 | 0.15 | 98.307744 | 1.692256 | 1.692256 |
| 1532 | 1533 | 2009 | 8.0 | South Dakota | SD | MRO | West North Central | 0.5 | warm | 2009-08-29 | ... | 56.65 | 26.73 | 2038.3 | 1905.4 | 4.7 | 0.30 | 0.15 | 98.307744 | 1.692256 | 1.692256 |
| 1533 | 1534 | 2000 | NaN | Alaska | AK | ASCC | NaN | NaN | NaN | NaT | ... | 66.02 | 21.56 | 1802.6 | 1276.0 | 0.4 | 0.05 | 0.02 | 85.761154 | 14.238846 | 2.901182 |
1534 rows × 56 columns
#cleaned and combined time columns
# Combine the separate date and time columns into single datetime columns.
# .astype(str) vectorizes the string conversion (same results as the original
# element-wise .apply(str)); missing times stringify to 'nan'/'NaT', which
# pd.to_timedelta parses back to NaT, so missingness propagates correctly.
outage = raw_outage.copy()
outage['OUTAGE.START'] = (pd.to_datetime(raw_outage['OUTAGE.START.DATE'])
                          + pd.to_timedelta(raw_outage['OUTAGE.START.TIME'].astype(str)))
outage['OUTAGE.RESTORATION'] = (pd.to_datetime(raw_outage['OUTAGE.RESTORATION.DATE'])
                                + pd.to_timedelta(raw_outage['OUTAGE.RESTORATION.TIME'].astype(str)))
outage
outage
| OBS | YEAR | MONTH | U.S._STATE | POSTAL.CODE | NERC.REGION | CLIMATE.REGION | ANOMALY.LEVEL | CLIMATE.CATEGORY | OUTAGE.START.DATE | ... | POPDEN_URBAN | POPDEN_UC | POPDEN_RURAL | AREAPCT_URBAN | AREAPCT_UC | PCT_LAND | PCT_WATER_TOT | PCT_WATER_INLAND | OUTAGE.START | OUTAGE.RESTORATION | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2011 | 7.0 | Minnesota | MN | MRO | East North Central | -0.3 | normal | 2011-07-01 | ... | 2279.0 | 1700.5 | 18.2 | 2.14 | 0.60 | 91.592666 | 8.407334 | 5.478743 | 2011-07-01 17:00:00 | 2011-07-03 20:00:00 |
| 1 | 2 | 2014 | 5.0 | Minnesota | MN | MRO | East North Central | -0.1 | normal | 2014-05-11 | ... | 2279.0 | 1700.5 | 18.2 | 2.14 | 0.60 | 91.592666 | 8.407334 | 5.478743 | 2014-05-11 18:38:00 | 2014-05-11 18:39:00 |
| 2 | 3 | 2010 | 10.0 | Minnesota | MN | MRO | East North Central | -1.5 | cold | 2010-10-26 | ... | 2279.0 | 1700.5 | 18.2 | 2.14 | 0.60 | 91.592666 | 8.407334 | 5.478743 | 2010-10-26 20:00:00 | 2010-10-28 22:00:00 |
| 3 | 4 | 2012 | 6.0 | Minnesota | MN | MRO | East North Central | -0.1 | normal | 2012-06-19 | ... | 2279.0 | 1700.5 | 18.2 | 2.14 | 0.60 | 91.592666 | 8.407334 | 5.478743 | 2012-06-19 04:30:00 | 2012-06-20 23:00:00 |
| 4 | 5 | 2015 | 7.0 | Minnesota | MN | MRO | East North Central | 1.2 | warm | 2015-07-18 | ... | 2279.0 | 1700.5 | 18.2 | 2.14 | 0.60 | 91.592666 | 8.407334 | 5.478743 | 2015-07-18 02:00:00 | 2015-07-19 07:00:00 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1529 | 1530 | 2011 | 12.0 | North Dakota | ND | MRO | West North Central | -0.9 | cold | 2011-12-06 | ... | 2192.2 | 1868.2 | 3.9 | 0.27 | 0.10 | 97.599649 | 2.401765 | 2.401765 | 2011-12-06 08:00:00 | 2011-12-06 20:00:00 |
| 1530 | 1531 | 2006 | NaN | North Dakota | ND | MRO | West North Central | NaN | NaN | NaT | ... | 2192.2 | 1868.2 | 3.9 | 0.27 | 0.10 | 97.599649 | 2.401765 | 2.401765 | NaT | NaT |
| 1531 | 1532 | 2009 | 8.0 | South Dakota | SD | RFC | West North Central | 0.5 | warm | 2009-08-29 | ... | 2038.3 | 1905.4 | 4.7 | 0.30 | 0.15 | 98.307744 | 1.692256 | 1.692256 | 2009-08-29 22:54:00 | 2009-08-29 23:53:00 |
| 1532 | 1533 | 2009 | 8.0 | South Dakota | SD | MRO | West North Central | 0.5 | warm | 2009-08-29 | ... | 2038.3 | 1905.4 | 4.7 | 0.30 | 0.15 | 98.307744 | 1.692256 | 1.692256 | 2009-08-29 11:00:00 | 2009-08-29 14:01:00 |
| 1533 | 1534 | 2000 | NaN | Alaska | AK | ASCC | NaN | NaN | NaN | NaT | ... | 1802.6 | 1276.0 | 0.4 | 0.05 | 0.02 | 85.761154 | 14.238846 | 2.901182 | NaT | NaT |
1534 rows × 58 columns
# Proportion of missing values in each consequence column, per cause category.
consequence_cols = ['OUTAGE.DURATION', 'DEMAND.LOSS.MW', 'CUSTOMERS.AFFECTED']
null_val_pc = (outage
               .groupby('CAUSE.CATEGORY')[consequence_cols]
               .apply(lambda grp: grp.isnull().sum() / len(grp)))
null_val_pc
| OUTAGE.DURATION | DEMAND.LOSS.MW | CUSTOMERS.AFFECTED | |
|---|---|---|---|
| CAUSE.CATEGORY | |||
| equipment failure | 0.083333 | 0.166667 | 0.500000 |
| fuel supply emergency | 0.254902 | 0.470588 | 0.862745 |
| intentional attack | 0.035885 | 0.557416 | 0.523923 |
| islanding | 0.043478 | 0.152174 | 0.260870 |
| public appeal | 0.000000 | 0.565217 | 0.695652 |
| severe weather | 0.024902 | 0.484928 | 0.060288 |
| system operability disruption | 0.031496 | 0.173228 | 0.346457 |
# OUTAGE.DURATION has the smallest average share of missing values
# across cause categories.
null_val_pc.mean(axis=0)
OUTAGE.DURATION 0.067714 DEMAND.LOSS.MW 0.367174 CUSTOMERS.AFFECTED 0.464276 dtype: float64
# Median outage duration for each cause category (the original comment said
# "average", but the code computes the median — robust to the extreme
# durations in this data). The lambda wrapper around .median() was redundant.
px.bar(outage.groupby('CAUSE.CATEGORY')['OUTAGE.DURATION'].median())
After some initial exploration, we discovered several interesting facts about the consequences of power outages and their causes. There seems to be a strong association between the null values in OUTAGE.DURATION, DEMAND.LOSS.MW, and CUSTOMERS.AFFECTED and the vandalism and uncontrolled-loss categories.
*Some interesting questions:*
How does El Niño affect power outages?
Is the severity of outage related to seasonal weather?
Is the west coast more likely to experience severe outage compared to east coast?
What characteristics are associated with each category of cause?
# Add a column holding each state's MEDIAN outage duration, broadcast back to
# every row (the original comment said "occurances of power outage per state",
# which is not what this computes). transform('median') replaces the
# redundant lambda and produces identical values (all-NaN groups stay NaN).
outage['AVG_DUR_PER_STATE'] = outage.groupby('U.S._STATE')['OUTAGE.DURATION'].transform('median')
/opt/anaconda3/envs/dsc80/lib/python3.8/site-packages/numpy/lib/nanfunctions.py:1117: RuntimeWarning: Mean of empty slice
# Univariate analysis of U.S._STATE: median outage duration per state,
# shaded on a US choropleth map.
choropleth_kwargs = dict(
    locations='POSTAL.CODE',
    locationmode='USA-states',
    scope="usa",
    color='AVG_DUR_PER_STATE',
    color_continuous_scale=px.colors.sequential.Reds,
    range_color=[1, 2500],
    hover_name='U.S._STATE',
    title='Median Length of Power Outages in US (Jan, 2000 - July, 2016)',
)
fig_1 = px.choropleth(outage, **choropleth_kwargs)
fig_1
# Eliminate outliers (1.5*IQR rule) before visualizing outage duration.
# Improvements over the original: the quantiles are computed once instead of
# four times, and the Python-level filter() is replaced with a vectorized
# boolean mask. NaNs fail both comparisons and are excluded, exactly as they
# were under filter(); wrapping the values in a fresh Series reproduces the
# original's 0..n-1 RangeIndex.
durations = outage['OUTAGE.DURATION']
q1, q3 = np.quantile(durations.dropna(), [0.25, 0.75])
duration_iqr = q3 - q1
duration_range = (q1 - 1.5 * duration_iqr, q3 + 1.5 * duration_iqr)
in_range = (durations > duration_range[0]) & (durations < duration_range[1])
outage_duration_wo_outlier = pd.Series(durations[in_range].to_numpy())
outage_duration_wo_outlier
0 3060.0
1 1.0
2 3000.0
3 2550.0
4 1740.0
...
1327 0.0
1328 220.0
1329 720.0
1330 59.0
1331 181.0
Length: 1332, dtype: float64
# Univariate analysis: distribution of outage durations (outliers removed).
# The update_* calls each return the figure, so the chain below both mutates
# fig_2 and leaves it as the cell's displayed value — same figure as before.
fig_2 = px.histogram(
    outage_duration_wo_outlier,
    histnorm='percent',
    title='Distribution of Outage Durations in Minutes',
)
fig_2.update_xaxes(title_text='Duration (mins)')
fig_2.update_yaxes(title_text='Proportion')
fig_2.update_traces(name='Outage')
# Yearly share of outages attributed to severe weather. The divisor is
# .count() (non-null causes only), matching the original's semantics.
def _severe_weather_share(causes):
    # one-line purpose: fraction of non-null causes equal to 'severe weather'
    return causes.eq('severe weather').sum() / causes.count()

sever_weather_pc_year = outage.groupby('YEAR')['CAUSE.CATEGORY'].apply(_severe_weather_share)
fig_3 = px.bar(sever_weather_pc_year, title='Power Outages by Severe Weather by Year')
fig_3.update_yaxes(title_text='Proportion of Outages Caused by Severe Weather')
fig_3.update_traces(name='Severe Weather Outages')
fig_3
#bivariate scatter plot between TOTAL.SALES and POPULATION column
# Each point is one state: mean TOTAL.SALES (y) vs mean POPULATION (x).
avg_usage = outage.groupby('U.S._STATE')['TOTAL.SALES'].mean()
population = outage.groupby('U.S._STATE')['POPULATION'].mean()
fig_4 = px.scatter(y=avg_usage,
                   x=population,
                   hover_name=population.index,
                   color=population.index,
                   # title fixed: typo 'Poluation' -> 'Population', and the
                   # x-axis is the MEAN population per state, not the total
                   title='Mean Total Sales of Each State vs Mean Population of Each State'
                   )
fig_4.update_yaxes(title_text='Mean Total Sales')
fig_4.update_xaxes(title_text='Mean Population')
# Mean outage duration broken down by state (rows) and cause category (columns).
outage.pivot_table(values='OUTAGE.DURATION',
                   index='U.S._STATE',
                   columns='CAUSE.CATEGORY',
                   aggfunc='mean')
| CAUSE.CATEGORY | equipment failure | fuel supply emergency | intentional attack | islanding | public appeal | severe weather | system operability disruption |
|---|---|---|---|---|---|---|---|
| U.S._STATE | |||||||
| Alabama | NaN | NaN | 77.000000 | NaN | NaN | 1421.750000 | NaN |
| Arizona | 138.500000 | NaN | 639.600000 | NaN | NaN | 25726.500000 | 384.500000 |
| Arkansas | 105.000000 | NaN | 547.833333 | 3.000000 | 1063.714286 | 2701.800000 | NaN |
| California | 524.809524 | 6154.60 | 946.458333 | 214.857143 | 2028.111111 | 2928.373134 | 363.666667 |
| Colorado | NaN | NaN | 117.000000 | 2.000000 | NaN | 2727.250000 | 279.750000 |
| Connecticut | NaN | NaN | 49.125000 | NaN | NaN | 2262.600000 | NaN |
| Delaware | 50.000000 | NaN | 38.918919 | NaN | NaN | 2153.500000 | NaN |
| District of Columbia | 159.000000 | NaN | NaN | NaN | NaN | 4764.111111 | NaN |
| Florida | 554.500000 | NaN | 50.000000 | NaN | 4320.000000 | 6420.192308 | 205.700000 |
| Georgia | NaN | NaN | 108.000000 | NaN | NaN | 1422.750000 | NaN |
| Hawaii | NaN | NaN | NaN | NaN | NaN | 997.500000 | 237.000000 |
| Idaho | NaN | NaN | 307.500000 | NaN | 1548.000000 | NaN | 179.666667 |
| Illinois | 149.000000 | 2761.00 | 1450.000000 | NaN | 120.000000 | 1650.700000 | NaN |
| Indiana | 1.000000 | 12240.00 | 421.875000 | 125.333333 | NaN | 4523.291667 | 4671.600000 |
| Iowa | NaN | NaN | 5657.800000 | NaN | NaN | 3353.666667 | NaN |
| Kansas | NaN | NaN | 561.000000 | NaN | 913.000000 | 9346.000000 | NaN |
| Kentucky | 652.000000 | 12570.00 | 108.000000 | NaN | NaN | 4480.111111 | NaN |
| Louisiana | 176.333333 | 28170.00 | NaN | NaN | 1359.214286 | 7186.928571 | 1144.666667 |
| Maine | NaN | 1676.00 | 82.666667 | 881.000000 | NaN | 1669.400000 | NaN |
| Maryland | NaN | NaN | 225.320000 | NaN | NaN | 4006.937500 | 304.000000 |
| Massachusetts | NaN | 2891.00 | 384.250000 | NaN | NaN | 1556.571429 | 67.000000 |
| Michigan | 26435.333333 | NaN | 3635.250000 | 1.000000 | 1078.000000 | 4831.650602 | 2610.000000 |
| Minnesota | NaN | NaN | 369.500000 | NaN | NaN | 3585.545455 | NaN |
| Mississippi | NaN | NaN | 12.000000 | NaN | NaN | NaN | 300.000000 |
| Missouri | NaN | NaN | 408.000000 | NaN | NaN | 4483.818182 | 65.000000 |
| Montana | NaN | NaN | 93.000000 | 34.500000 | NaN | NaN | NaN |
| Nebraska | NaN | NaN | NaN | NaN | 159.000000 | 3221.333333 | NaN |
| Nevada | NaN | NaN | 553.285714 | NaN | NaN | NaN | NaN |
| New Hampshire | NaN | NaN | 60.000000 | NaN | NaN | 1597.500000 | NaN |
| New Jersey | NaN | NaN | 91.125000 | NaN | NaN | 6372.863636 | 748.500000 |
| New Mexico | NaN | 76.00 | 174.500000 | NaN | NaN | NaN | 0.000000 |
| New York | 247.000000 | 16687.25 | 309.083333 | NaN | 2655.000000 | 6034.575758 | 1176.571429 |
| North Carolina | NaN | NaN | 1063.750000 | NaN | NaN | 1738.933333 | 82.200000 |
| North Dakota | NaN | NaN | NaN | NaN | 720.000000 | NaN | NaN |
| Ohio | NaN | NaN | 327.285714 | NaN | NaN | 4322.269231 | 1744.500000 |
| Oklahoma | NaN | NaN | 75.666667 | 984.000000 | 704.000000 | 4206.466667 | NaN |
| Oregon | 200.000000 | NaN | 394.105263 | NaN | NaN | 2295.800000 | NaN |
| Pennsylvania | 376.000000 | NaN | 1526.833333 | NaN | NaN | 4314.000000 | 329.000000 |
| South Carolina | NaN | NaN | NaN | NaN | NaN | 3135.000000 | NaN |
| South Dakota | NaN | NaN | NaN | 120.000000 | NaN | NaN | NaN |
| Tennessee | 404.000000 | NaN | 171.000000 | NaN | 2700.000000 | 1386.350000 | 20.000000 |
| Texas | 405.600000 | 13920.00 | 298.769231 | NaN | 1140.411765 | 3854.890625 | 810.800000 |
| Utah | 15.000000 | NaN | 142.285714 | NaN | 2275.000000 | 957.000000 | 537.500000 |
| Vermont | NaN | NaN | 35.444444 | NaN | NaN | NaN | NaN |
| Virginia | NaN | NaN | 2.000000 | NaN | 683.500000 | 1132.281250 | 241.000000 |
| Washington | 1204.000000 | 1.00 | 371.870968 | 73.333333 | 248.000000 | 5473.550000 | 25.000000 |
| West Virginia | NaN | NaN | 1.000000 | NaN | NaN | 9305.000000 | NaN |
| Wisconsin | NaN | 33971.25 | 459.000000 | NaN | 388.000000 | 1527.428571 | NaN |
| Wyoming | 61.000000 | NaN | 0.333333 | 32.000000 | NaN | 106.000000 | NaN |
NMAR analysis on column CAUSE.CATEGORY.DETAIL. This column appears to be documented and written by researchers by hand, as the labels used for detailed causes are quite messy and inconsistent. For example, there are two very similar labels "Coal" and " Coal", both of which correspond to a power outage caused by a coal power plant issue. Another occurrence is the various notations of wind damage, including "heavy wind", "wind/rain", "wind storm", and "wind". These clues imply that this column is reported by hand, and the naming of each label varies from one person to another. Therefore, it is very likely that the missing values are an incident of human error while collecting the information. If the cause details are unknown to the researcher, or the causes are quite obvious and not worth detailing, then the researcher is more likely to not write anything in this column. And so, the missingness depends on the (unrecorded) values themselves, making the column NMAR.
# Inspect the raw, hand-entered labels in CAUSE.CATEGORY.DETAIL
# (pd.unique on a Series returns the same ndarray as Series.unique()).
pd.unique(outage['CAUSE.CATEGORY.DETAIL'])
array([nan, 'vandalism', 'heavy wind', 'thunderstorm', 'winter storm',
'tornadoes', 'sabotage', 'hailstorm', 'uncontrolled loss',
'winter', 'wind storm', 'computer hardware', 'public appeal',
'storm', ' Coal', ' Natural Gas', 'hurricanes', 'wind/rain',
'snow/ice storm', 'snow/ice ', 'transmission interruption',
'flooding', 'transformer outage', 'generator trip',
'relaying malfunction', 'transmission trip', 'lightning',
'switching', 'shed load', 'line fault', 'breaker trip', 'wildfire',
' Hydro', 'majorsystem interruption', 'voltage reduction',
'transmission', 'Coal', 'substation', 'heatwave',
'distribution interruption', 'wind', 'suspicious activity',
'feeder shutdown', '100 MW loadshed', 'plant trip', 'fog', 'Hydro',
'earthquake', 'HVSubstation interruption', 'cables', 'Petroleum',
'thunderstorm; islanding', 'failure'], dtype=object)
# Does OUTAGE.DURATION's missingness depend on CAUSE.CATEGORY?
# Build the conditional distributions of cause given missing / not missing.
od_mar_test = outage[['CAUSE.CATEGORY', 'OUTAGE.DURATION']].copy()
od_mar_test['od_missing'] = od_mar_test['OUTAGE.DURATION'].isnull()
counts = od_mar_test.pivot_table(index='CAUSE.CATEGORY', columns='od_missing', aggfunc='size')
od_dist = counts.fillna(0) / counts.sum()
od_dist
| od_missing | False | True |
|---|---|---|
| CAUSE.CATEGORY | ||
| equipment failure | 0.037263 | 0.086207 |
| fuel supply emergency | 0.025745 | 0.224138 |
| intentional attack | 0.273035 | 0.258621 |
| islanding | 0.029810 | 0.034483 |
| public appeal | 0.046748 | 0.000000 |
| severe weather | 0.504065 | 0.327586 |
| system operability disruption | 0.083333 | 0.068966 |
# Side-by-side bars of the two conditional distributions.
od_dist.plot(kind='barh', barmode='group')
# Observed total variation distance between the distributions; the columns
# are od_missing == False and True, so (True - False) matches the original
# diff(axis=1).iloc[:, -1].
observed_tvd = (od_dist[True] - od_dist[False]).abs().sum() / 2
observed_tvd
0.2520091580226147
#permutation test
# Shuffle the missingness flag, recompute the TVD each time, and collect the
# permuted statistics; finally overlay the observed TVD on their distribution.
n_repetitions = 5000
shuffled_od = od_mar_test.copy()
perm_tvds = []
for _ in range(n_repetitions):
    shuffled_od['od_missing'] = np.random.permutation(shuffled_od['od_missing'])
    shuffled_dist = (shuffled_od
                     .pivot_table(index='CAUSE.CATEGORY', columns='od_missing', aggfunc='size')
                     .apply(lambda col: col / col.sum()))
    perm_tvds.append(shuffled_dist.diff(axis=1).iloc[:, -1].abs().sum() / 2)
# plots the distributions from the final shuffle only
shuffled_dist.plot(kind='barh', barmode='group')
od_p_test = px.histogram(pd.DataFrame(perm_tvds), x=0, nbins=100, histnorm='probability',
                         title='Empirical Distribution of the TVD')
od_p_test.add_vline(x=observed_tvd, line_color='red')
# p-value for this test: fraction of permuted TVDs at least as extreme as the
# observed one. Fixed: use >= rather than > so the observed statistic counts
# toward its own p-value, per permutation-test convention (with continuous
# TVDs this rarely changes the value).
(np.array(perm_tvds) >= observed_tvd).mean()
# Conclusion: missingness of OUTAGE.DURATION depends on CAUSE.CATEGORY -> MAR.
# (The original comment incorrectly named DEMAND.LOSS.MW; this test was run
# on OUTAGE.DURATION.)
0.0004
Second Permutation Test
# Does OUTAGE.DURATION's missingness depend on CUSTOMERS.AFFECTED?
not_mar_test = outage[['OUTAGE.DURATION', 'CUSTOMERS.AFFECTED']].copy()
not_mar_test['DURATION.MISSING'] = not_mar_test['OUTAGE.DURATION'].isna()
not_mar_test
| OUTAGE.DURATION | CUSTOMERS.AFFECTED | DURATION.MISSING | |
|---|---|---|---|
| 0 | 3060.0 | 70000.0 | False |
| 1 | 1.0 | NaN | False |
| 2 | 3000.0 | 70000.0 | False |
| 3 | 2550.0 | 68200.0 | False |
| 4 | 1740.0 | 250000.0 | False |
| ... | ... | ... | ... |
| 1529 | 720.0 | 34500.0 | False |
| 1530 | NaN | NaN | True |
| 1531 | 59.0 | NaN | False |
| 1532 | 181.0 | NaN | False |
| 1533 | NaN | 14273.0 | True |
1534 rows × 3 columns
# Overlaid distributions of CUSTOMERS.AFFECTED by missingness of duration.
px.histogram(not_mar_test, x='CUSTOMERS.AFFECTED', color='DURATION.MISSING', barmode='overlay', opacity=0.7)
# Test statistic: absolute difference between the two group means.
# Fixed: float(<1-element ndarray>) is deprecated in recent NumPy; use
# .item() to extract the scalar instead. Same value as before.
group_means = not_mar_test.groupby('DURATION.MISSING')['CUSTOMERS.AFFECTED'].mean()
observed_adm = abs(np.diff(group_means).item())
observed_adm
20599.732900432893
#permutation test
# Shuffle the missingness flag and recompute the absolute difference of
# group means each repetition.
n_repetitions = 5000
perm_adms = []
# NOTE(review): `shuffled_od` is reused from the first permutation test;
# the name is kept so nothing downstream breaks, but a fresh name would
# be clearer.
shuffled_od = not_mar_test.copy()
for _ in range(n_repetitions):
    shuffled_od['DURATION.MISSING'] = np.random.permutation(shuffled_od['DURATION.MISSING'])
    means = shuffled_od.groupby('DURATION.MISSING')['CUSTOMERS.AFFECTED'].mean()
    # .item() replaces the deprecated float(ndarray) conversion
    perm_adms.append(abs(np.diff(means).item()))
# p-value for dependency on customers affected: fraction of permuted
# statistics at least as extreme as the observed one (>= per
# permutation-test convention).
(np.array(perm_adms) >= observed_adm).mean()
# Large p-value: fail to reject the null — no evidence that duration's
# missingness depends on CUSTOMERS.AFFECTED.
0.6652
# Empirical distribution of the permuted statistic with the observed value
# marked. Title fixed: the statistic is the ABSOLUTE difference of means
# (the original title said "Average Difference of Means").
od_p_test = px.histogram(pd.DataFrame(perm_adms), x=0, nbins=100, histnorm='probability',
                         title='Empirical Distribution of the Absolute Difference of Means')
od_p_test.add_vline(x=observed_adm, line_color='red')
*Null Hypothesis:* Severe weather related outage durations *are* randomly sampled from the population of outage duration.
*Alternative Hypothesis:* Severe weather related outage durations *are not* randomly sampled from the population of outage duration.
*Observation:* outage durations caused by severe weather
*Population:* all outage durations (from data)
*Test Statistic:* mean of sampled durations
*Sample Size:* number of outage durations that have been categorized as caused by severe weather
#hypothesis testing
# Are severe-weather outage durations consistent with a random sample drawn
# from all (non-null) outage durations? Draw sample_n durations without
# replacement 100,000 times and compare the sample means to the observed
# severe-weather mean.
hypothesis_test = outage[outage['OUTAGE.DURATION'].notnull()].copy()
severe = hypothesis_test['CAUSE.CATEGORY'] == 'severe weather'
sample_n = int(severe.sum())
observed_mean = hypothesis_test.loc[severe, 'OUTAGE.DURATION'].mean()

N_repetitions = 100_000
ht_means = np.array([
    hypothesis_test['OUTAGE.DURATION'].sample(sample_n).mean()
    for _ in range(N_repetitions)
])
# p-value. BUG FIX: the original compared a plain Python list to a float with
# `>`, which raises TypeError in Python 3; ht_means is now an ndarray so the
# comparison broadcasts element-wise. Also uses >= so the observed statistic
# counts toward its own p-value.
(ht_means >= observed_mean).mean()
0.0
# Visualize the sampling distribution of the mean with the observed mean marked.
# BUG FIX: the original reassigned `hypothesis_test`, clobbering the DataFrame
# of the same name defined above; the figure gets its own name instead.
ht_dist_fig = px.histogram(pd.DataFrame(ht_means), x=0, nbins=100, histnorm='probability',
                           title='Empirical Distribution of the Mean of Sampled Durations')
ht_dist_fig.add_vline(x=observed_mean, line_color='red')
# conclusion: the observed severe-weather mean lies far in the right tail,
# so we reject the null.